# Load the following packages needed for modeling in this assignment
require(caret)
## Loading required package: caret
## Loading required package: ggplot2
## Loading required package: lattice
require(recipes)
## Loading required package: recipes
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
##
## step
require(finalfit)
## Loading required package: finalfit
require(glmnet)
## Loading required package: glmnet
## Loading required package: Matrix
## Loaded glmnet 4.1-2
require(ModelMetrics)
## Loading required package: ModelMetrics
##
## Attaching package: 'ModelMetrics'
## The following objects are masked from 'package:caret':
##
## confusionMatrix, precision, recall, sensitivity, specificity
## The following object is masked from 'package:base':
##
## kappa
# Read the tweet data (outcome plus sentence-embedding columns) straight
# from the course repository; the first row of the file holds column names.
tweet <- read.csv(
  file = "https://raw.githubusercontent.com/uo-datasci-specialization/c4-ml-fall-2021/main/content/post/hw2/data/hw1_tweet_final.csv",
  header = TRUE
)
# Preprocessing recipe (blueprint) for the tweet dataset.
# Roles are derived from the column names: 'sentiment' is the outcome and
# every other column is a predictor. This avoids hard-coding the position of
# the outcome and the number of predictor columns.
blueprint_tweet <- recipe(
  x = tweet,
  vars = colnames(tweet),
  roles = ifelse(colnames(tweet) == "sentiment", "outcome", "predictor")
) %>%
  # One-hot encode the month variable.
  step_dummy("month", one_hot = TRUE) %>%
  # Encode the cyclical time variables as one sin/cos pair each.
  # role = "predictor" is set explicitly so the derived columns
  # (day_sin_1, day_cos_1, ...) have a defined role.
  step_harmonic("day", frequency = 1, cycle_size = 7, role = "predictor") %>%
  step_harmonic("date", frequency = 1, cycle_size = 31, role = "predictor") %>%
  step_harmonic("hour", frequency = 1, cycle_size = 24, role = "predictor") %>%
  # Standardize the 768 embedding dimensions.
  step_normalize(paste0("Dim", 1:768)) %>%
  # Standardize the derived sin/cos features as well.
  step_normalize(c(
    "day_sin_1", "day_cos_1",
    "date_sin_1", "date_cos_1",
    "hour_sin_1", "hour_cos_1"
  )) %>%
  # Convert the numeric outcome to a labelled factor. The transform shifts
  # the values by 1 before they are matched to `levels`; presumably
  # sentiment is coded 0/1 (0 = Negative, 1 = Positive) -- verify against
  # the data.
  step_num2factor(sentiment,
    transform = function(x) x + 1,
    levels = c("Negative", "Positive")
  )
# Notice that I explicitly specified role='predictor' when using the
# step_harmonic function. This ensures that the newly derived sin and cos
# variables have a defined role.
# You need to do this; otherwise, the caret::train function breaks.
# caret::train requires every variable in the recipe to have a role.
# You can run the following code to make sure every variable has a defined
# role. If you want to experiment, remove the role='predictor' argument
# from the step_harmonic calls, create the recipe again, and run the
# following code. You will see that the new sin and cos variables have NA
# in the role column, and this breaks the caret::train function later.
# Also, in the last line, we transform the outcome variable 'sentiment' to
# a factor with labels.
# This seems necessary for fitting logistic regression via caret::train
# Train the recipe, then print one row per resulting variable showing its
# type, role, and source.
print(summary(prep(blueprint_tweet)))
## # A tibble: 781 x 4
## variable type role source
## <chr> <chr> <chr> <chr>
## 1 sentiment nominal outcome original
## 2 day numeric predictor original
## 3 date numeric predictor original
## 4 hour numeric predictor original
## 5 Dim1 numeric predictor original
## 6 Dim2 numeric predictor original
## 7 Dim3 numeric predictor original
## 8 Dim4 numeric predictor original
## 9 Dim5 numeric predictor original
## 10 Dim6 numeric predictor original
## # ... with 771 more rows
Split the original data into two subsets: training and test. The training data should contain 80% of the cases and the test data the remaining 20%.
# Randomly split the tweet data into training (80%) and test (20%) sets.
set.seed(11142021) # for reproducibility

# Row indices of the training cases. seq_len() is used instead of 1:nrow()
# so that an empty data frame yields an empty index set rather than c(1, 0).
loc <- sample(seq_len(nrow(tweet)), round(nrow(tweet) * 0.8))

tweet_train <- tweet[loc, ]  # sampled rows
tweet_test <- tweet[-loc, ]  # all remaining rows

# Print the training set for inspection.
tweet_train